1 Setup

suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
suppressPackageStartupMessages(library(here))
source(here("UPSCb-common/src/R/plot.multidensity.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(12,"Paired")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv("~/Git/zygoticEmbryogenesis/doc/ZE_Dataset_v3.csv",
                    col_types = cols(col_character(),
                                     col_character(),
                                     col_factor(),
                                     col_character(),
                                     col_factor(),
                                     col_character(),
                                     col_double(),
                                     col_double())) %>% 
  mutate(Tissue=factor(Tissue)) %>% 
  mutate(Time=factor(Time))

2 Analysis

2.1 Raw data

lb.filelist <- list.files(here("data/RNA-Seq/salmon"), 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Filter and select only one duplicate from each sample.

lb.filterlist <- str_subset(lb.filelist,"L001_sortmerna_trimmomatic")

#removed P11562_112 trimmomatic dataset
lb.filterlist <- (str_subset(lb.filterlist, "/mnt/picea/home/mstewart/Git/zygoticEmbryogenesis/data/RNA-Seq/salmon/P11562_112_S11_L001_sortmerna_trimmomatic/quant.sf", negate = TRUE))
#removed P11562_112 row from sample list
samples <- filter(samples, !grepl("112",NGI.ID))
#samples <- filter(samples, !grepl("P11562_201",NGI.ID)) NOT REMOVING THIS SAMPLE ANYMORE, FIXED FROM FMG TO S
#lb.filterlist <- (str_subset(lb.filterlist, "P11562_201", negate = TRUE)) NOT REMOVING THIS SAMPLE ANYMORE, FIXED FROM FMG TO S




##extract S tissue rows
##remove those rows from original
trim_samples <- subset(samples, samples$Tissue == "S")
samples <- subset(samples, samples$Tissue != "S")
samples
## # A tibble: 47 x 8
##    NGI.ID     User.ID     Time  Sample.ID Tissue Replicate Mreads X..Q30
##    <chr>      <chr>       <fct> <chr>     <fct>  <chr>      <dbl>  <dbl>
##  1 P11562_110 PabB4C2_FMG B4    B4C2      FMG    B4-FMG     13.1    91.4
##  2 P11562_111 PabB4C2_ZE  B4    B4C2      ZE     B4-ZE      16.6    93.0
##  3 P11562_113 PabB4C4_ZE  B4    B4C4      ZE     B4-ZE      17.0    93.4
##  4 P11562_114 PabB4C5_FMG B4    B4C5      FMG    B4-FMG     12.7    89.9
##  5 P11562_115 PabB4C5_ZE  B4    B4C5      ZE     B4-ZE      12.5    91.1
##  6 P11562_116 PabB5C1_FMG B5    B5C1      FMG    B5-FMG      9.46   91.8
##  7 P11562_117 PabB5C1_ZE  B5    B5C1      ZE     B5-ZE      15.6    92.1
##  8 P11562_118 PabB5C5_FMG B5    B5C5      FMG    B5-FMG     14.1    90.2
##  9 P11562_119 PabB5C5_ZE  B5    B5C5      ZE     B5-ZE      13.8    90.2
## 10 P11562_120 PabB5C6_FMG B5    B5C6      FMG    B5-FMG     16.4    90  
## # … with 37 more rows
#trim_samples

##match NGI.IDs to filelist
##remove files
lb.filterlist_trim <- lb.filterlist
lb.filterlist_trim <- str_subset(lb.filterlist_trim, c(trim_samples$NGI.ID))
## Warning in stri_subset_regex(string, pattern, omit_na = TRUE, negate =
## negate, : longer object length is not a multiple of shorter object length
lb.filterlist_trim <- append(lb.filterlist_trim, c(str_subset(lb.filterlist, "P11562_201")))
#
#
lb.filterlist <- str_subset(lb.filterlist, c(trim_samples$NGI.ID), negate = TRUE)
## Warning in stri_subset_regex(string, pattern, omit_na = TRUE, negate =
## negate, : longer object length is not a multiple of shorter object length
lb.filterlist <- str_subset(lb.filterlist, "P11562_201", negate = TRUE)
#
#
#double_A <- trim_samples
#double_B <- trim_samples
#double_A$NGI.ID <- str_replace(double_A$NGI.ID,"P11562_","P11562_A_")
#double_B$NGI.ID <- str_replace(double_B$NGI.ID,"P11562_","P11562_B_")
#double_A$Tissue <- str_replace(double_A$Tissue,"S","FMG")
#double_B$Tissue <- str_replace(double_B$Tissue,"S","ZE")
#double_A$Replicate <- str_replace(double_A$Replicate,"S","FMG")
#double_B$Replicate <- str_replace(double_B$Replicate,"S","ZE")
#
#lb.filterlist_trim_A <- lb.filterlist_trim
#lb.filterlist_trim_B <- lb.filterlist_trim







stopifnot(all(str_which(lb.filterlist, samples$NGI.ID) == 1:length(lb.filterlist)))
##assign names to the filtered filelist (which removed L002)
names(lb.filterlist) <- samples$NGI.ID
#names(lb.filterlist_trim_A) <- double_A$NGI.ID
#names(lb.filterlist_trim_B) <- double_B$NGI.ID

lb.filterlist <- lb.filterlist[samples$Tissue %in% c("ZE","FMG")] ##Are we only looking at ZE?
#lb.filterlist_trim_A <- lb.filterlist_trim_A[double_A$Tissue %in% c("FMG")] ##Are we only looking at ZE?
#lb.filterlist_trim_B <- lb.filterlist_trim_B[double_B$Tissue %in% c("ZE")] ##Are we only looking at ZE?


#samples <- bind_rows(samples,double_A,double_B)
samples <- subset(samples, select = -c(User.ID, Sample.ID, Replicate, Mreads, X..Q30) )

#, lb.filterlist_trim_A, lb.filterlist_trim_B

Read the expression at the gene level (there is one transcript per gene)

lb.g <- suppressMessages(tximport(files = c(lb.filterlist), 
                                  type = "salmon",txOut=TRUE))

counts <- round(lb.g$counts)

2.2 Raw Data QC analysis

Check how many genes are never expressed - reasonable level of non-expressed genes indicated.

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "21.9% percent (14484) of 66069 genes are not expressed"

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Tissue=samples$Tissue[match(ind,samples$NGI.ID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$NGI.ID)])

2.3 Export

dir.create(here("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("analysis/salmon/ZE-ZF-unnormalised-gene-expression_data.csv"))
############## change export location name

2.4 Data normalisation

2.4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples,
  design = ~ Time * Tissue)
## converting counts to integer mode
## factor levels were dropped which had no samples
#want to duplicate S here, rename each twin to FMG and ZE, and give them unique NGI.IDs



save(dds,file=here("analysis/salmon/ZE-ZF-Dataset-dds.rda"))

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is relatively variable (0 to 200 %) however the low end of the variance is likely due to poor samples where little sequencing data was actually produce that wasnt 16s bacterial.

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11562_110 P11562_111 P11562_113 P11562_114 P11562_115 P11562_116
0.492 1.509 1.352 0.6335 0.9565 0.2825
Table continues below
P11562_117 P11562_118 P11562_119 P11562_120 P11562_121 P11562_123
1.147 0.526 1.003 0.6942 1.286 1.086
Table continues below
P11562_124 P11562_125 P11562_126 P11562_127 P11562_128 P11562_129
0.4092 1.17 0.5539 1.503 1.061 1.206
Table continues below
P11562_130 P11562_131 P11562_133 P11562_134 P11562_135 P11562_136
1.079 1.19 1.697 1.138 1.515 1.234
Table continues below
P11562_137 P11562_138 P11562_139 P11562_140 P11562_141 P11562_142
1.037 0.8039 1.053 1.017 1.676 1.535
Table continues below
P11562_143 P11562_145 P11562_146 P11562_147 P11562_148 P11562_149
0.9657 1.448 0.8732 0.9698 1.709 1.019
Table continues below
P11562_150 P11562_151 P11562_153 P11562_154 P11562_155 P11562_157
1.57 1.244 0.5682 1.044 1.726 1.196
P11562_202 P11562_203 P11562_204 P11562_205 P11562_206
1.071 1.266 1.065 1.069 0.6645
boxplot(sizes, main="Sequencing libraries size factor")

2.5 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked very well, the data is nearly homoscesdastic

meanSdPlot(vst[rowSums(counts)>0,])

2.6 QC on the normalised data

2.6.1 PCA

pc <- prcomp(t(vst))

percent <- round(summary(pc)$importance[2,]*100)
plot(cumsum(percent))

2.6.2 3 first dimensions

Seems that different time points form small clusters and ZE and FMG tissue types appear to separate. These are Comp1 and Comp2 which explains the different between most of the sampels except for one B4 ZE sample. Appears to be an outlier. This seems to indicate that the Tissue and Time components explain the difference between samples. ### 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples)

ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

2.6.3 Interactive PCA Plot

suppressPackageStartupMessages(library(plotly))

interplot <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=NGI.ID)) +
  geom_point(size=2) +
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(interplot, tooltip = "all") %>% layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
                       yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

2.6.4 Heatmap

Filter for noise A cutoff at a VST value of 1 leaves about 32000 genes - is this adequate for the QA?

conds <- factor(paste(samples$Tissue,samples$Time))
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted
## from logarithmic plot

vstCutoff <- 4+1
  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples. It appears that generally there is little difference between the samples across all genes
  • a small difference is noticable between ZE tissues and FMG-S tissues, however generally the expression levels are relatively balanced apart from the one sample in FMG B4. Somatic samples had a small section of more highly expressed genes compared to ZE and FMG.
heatmap.2(t(scale(t(vst[sels[[vstCutoff]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

2.7 Conclusion ###################### make separate notes for this.

The data appears to show a correlation between Tissue and Time, with some clustering in the PCA plot showing a movement of time in ascending order from right to left (+ to -) along the X axis. There can also be seen that along the Y axis, there are subgroups which correlate to the Tissue type. Along with this, it appears that overall gene expression does not appear starkly different but slight differences can be observed between Tissue types. It appears that the most difference between Tissues can be seen in the 1000 most variable genes, with some small changes between Time points within those Tissue groups. The final heat map of the 1000 least variable genes appear to show a very mixed expression pattern, similar in appearance to the total gene heat map, between Tissue groups and Time points, except for in the Somatic tissue type.

3 Session Info

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] here_0.1                    vsn_3.52.0                 
##  [3] tximport_1.12.3             forcats_0.4.0              
##  [5] stringr_1.4.0               dplyr_0.8.3                
##  [7] purrr_0.3.2                 readr_1.3.1                
##  [9] tidyr_0.8.3                 tibble_2.1.3               
## [11] tidyverse_1.2.1             scatterplot3d_0.3-41       
## [13] RColorBrewer_1.1-2          plotly_4.9.0               
## [15] pander_0.6.3                hyperSpec_0.99-20180627    
## [17] ggplot2_3.2.1               lattice_0.20-38            
## [19] gplots_3.0.1.1              DESeq2_1.24.0              
## [21] SummarizedExperiment_1.14.1 DelayedArray_0.10.0        
## [23] BiocParallel_1.18.1         matrixStats_0.54.0         
## [25] Biobase_2.44.0              GenomicRanges_1.36.0       
## [27] GenomeInfoDb_1.20.0         IRanges_2.18.2             
## [29] S4Vectors_0.22.0            BiocGenerics_0.30.0        
## [31] data.table_1.12.2          
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1       rprojroot_1.3-2        htmlTable_1.13.1      
##   [4] XVector_0.24.0         base64enc_0.1-3        rstudioapi_0.10       
##   [7] hexbin_1.27.3          affyio_1.54.0          bit64_0.9-7           
##  [10] fansi_0.4.0            AnnotationDbi_1.46.1   lubridate_1.7.4       
##  [13] xml2_1.2.2             splines_3.6.1          geneplotter_1.62.0    
##  [16] knitr_1.24             zeallot_0.1.0          Formula_1.2-3         
##  [19] jsonlite_1.6           broom_0.5.2            annotate_1.62.0       
##  [22] cluster_2.1.0          shiny_1.3.2            BiocManager_1.30.4    
##  [25] compiler_3.6.1         httr_1.4.1             backports_1.1.4       
##  [28] assertthat_0.2.1       Matrix_1.2-17          lazyeval_0.2.2        
##  [31] limma_3.40.6           cli_1.1.0              later_0.8.0           
##  [34] acepack_1.4.1          htmltools_0.3.6        tools_3.6.1           
##  [37] gtable_0.3.0           glue_1.3.1             GenomeInfoDbData_1.2.1
##  [40] affy_1.62.0            Rcpp_1.0.2             cellranger_1.1.0      
##  [43] vctrs_0.2.0            preprocessCore_1.46.0  gdata_2.18.0          
##  [46] nlme_3.1-141           crosstalk_1.0.0        xfun_0.9              
##  [49] testthat_2.2.1         rvest_0.3.4            mime_0.7              
##  [52] gtools_3.8.1           XML_3.98-1.20          zlibbioc_1.30.0       
##  [55] scales_1.0.0           promises_1.0.1         hms_0.5.1             
##  [58] yaml_2.2.0             memoise_1.1.0          gridExtra_2.3         
##  [61] rpart_4.1-15           latticeExtra_0.6-28    stringi_1.4.3         
##  [64] RSQLite_2.1.2          highr_0.8              genefilter_1.66.0     
##  [67] checkmate_1.9.4        caTools_1.17.1.2       rlang_0.4.0           
##  [70] pkgconfig_2.0.2        bitops_1.0-6           evaluate_0.14         
##  [73] labeling_0.3           htmlwidgets_1.3        bit_1.1-14            
##  [76] tidyselect_0.2.5       magrittr_1.5           R6_2.4.0              
##  [79] generics_0.0.2         Hmisc_4.2-0            DBI_1.0.0             
##  [82] pillar_1.4.2           haven_2.1.1            foreign_0.8-72        
##  [85] withr_2.1.2            survival_2.44-1.1      RCurl_1.95-4.12       
##  [88] nnet_7.3-12            modelr_0.1.5           crayon_1.3.4          
##  [91] utf8_1.1.4             KernSmooth_2.23-15     rmarkdown_1.15        
##  [94] locfit_1.5-9.1         readxl_1.3.1           blob_1.2.0            
##  [97] digest_0.6.20          xtable_1.8-4           httpuv_1.5.1          
## [100] munsell_0.5.0          viridisLite_0.3.0